Overview

This script is aimed at showing how the Principal Components Analysis (PCA) behaves for sinusoidal patterned data. The idea is that if the gene patterns have high signal to noise variation, then the PC1 vs PC2 plot would be close to a circle. The more we add noise and randomness to the sinusoidal patterns, the less obvious the circular pattern of the PCs would tend to be.

Simulation Example 1 (High SNR scenario)

We present a non-sinusoidal gene patterns scenario now.

library(cellcycleR)
library(wavethresh)
## Loading required package: MASS
## WaveThresh: R wavelet software, release 4.6.6, installed
## 
## Copyright Guy Nason and others 1993-2013
## 
## Note: nlevels has been renamed to nlevelsWT
G <- 100;
num_cells <- 256;
amp_genes <- rep(10, G);
phi_genes <- rep(c(2,4,6,8), each=G/4);
sigma_genes <- rchisq(G, 0.01);
cell_times_sim <- seq(0,2*pi, length.out=num_cells);
cycle_data <- sim_sinusoidal_cycle(G, amp_genes, phi_genes, sigma_genes, cell_times_sim);

celltime_levels <- 256;

sample_reorder <- sample(1:num_cells,num_cells, replace=FALSE);
cell_times_reorder <- cell_times_sim[sample_reorder];
cycle_data_reorder <- cycle_data[sample_reorder,];

PC1 vs PC2

We perform PCA on the reordered data.

pca_cycle <- prcomp(cycle_data_reorder, center=TRUE, scale. = TRUE);
plot(pca_cycle$x[,1], pca_cycle$x[,2], pch=20, lwd=0.01)

Note the PCA maps the points on the circle. We now apply sinusoidal cellcycleR on the data.

system.time(out_sinusoidal <- sin_cell_ordering_class(cycle_data_reorder, celltime_levels = 256, num_iter=100, verbose=FALSE))
## Final loglikelihood, iter 100 : 741912.131013
##    user  system elapsed 
## 144.569  51.390  66.406

cellcycleR: radial plots

We plot the radial plots first.

library(plotrix)
library(RColorBrewer)
radial.plot(lengths=1:length(out_sinusoidal$cell_times),radial.pos=out_sinusoidal$cell_times[order(cell_times_reorder)],
            line.col=colorRampPalette(brewer.pal(9,"Blues"))(length(out_sinusoidal$cell_times)), lwd=2)

Gene patterns

We plot the patterns for two genes (for arbitrary indices)

plot(cycle_data_reorder[order(out_sinusoidal$cell_times),1], type="l", xlab="cell times", ylab="gene expression")

plot(cycle_data[,1], type="l", xlab="cell times", ylab="gene expression")

plot(cycle_data_reorder[order(out_sinusoidal$cell_times),60], type="l", xlab="cell times", ylab="gene expression")

plot(cycle_data[,60], type="l", xlab="cell times", ylab="gene expression")

Rank PCA vs cellcycleR

Now we rank the cells based on the cell times estimated from sinusoidal cellcycleR and then plot the ranks against the ranks based on the first PC.

plot(rank(out_sinusoidal$cell_times), rank(pca_cycle$x[,1]), xlab="sin. cellcycleR rank", ylab="pca 1 rank", pch=20, lwd=0.5, cex=1, col="red")

plot(rank(out_sinusoidal$cell_times), rank(pca_cycle$x[,2]), xlab="sin. cellcycleR rank", ylab="pca 2 rank", pch=20, lwd=0.5, cex=1, col="red")

Simulation example 2 (Moderate SNR scenario)

library(cellcycleR)
library(wavethresh)

G <- 100;
num_cells <- 256;
amp_genes <- rep(1, G);
phi_genes <- rep(c(2,4,6,8), each=G/4);
sigma_genes <- rchisq(G, 3);
cell_times_sim <- seq(0,2*pi, length.out=num_cells);
cycle_data <- sim_sinusoidal_cycle(G, amp_genes, phi_genes, sigma_genes, cell_times_sim);

celltime_levels <- 256;

sample_reorder <- sample(1:num_cells,num_cells, replace=FALSE);
cell_times_reorder <- cell_times_sim[sample_reorder];
cycle_data_reorder <- cycle_data[sample_reorder,];

PC1 vs PC2

We perform PCA on the reordered data.

pca_cycle <- prcomp(cycle_data_reorder, center=TRUE, scale. = TRUE);
plot(pca_cycle$x[,1], pca_cycle$x[,2], pch=20, lwd=0.01)

We apply sinusoidal cellcycleR

system.time(out_sinusoidal <- sin_cell_ordering_class(cycle_data_reorder, celltime_levels = 256, num_iter=100, verbose = FALSE))
## Final loglikelihood, iter 100 : -53835.7277772
##    user  system elapsed 
## 147.811  51.766  67.671

cellcycleR: radial plots

We plot the radial plots first.

library(plotrix)
library(RColorBrewer)
radial.plot(lengths=1:length(out_sinusoidal$cell_times),radial.pos=out_sinusoidal$cell_times[order(cell_times_reorder)],
            line.col=colorRampPalette(brewer.pal(9,"Blues"))(length(out_sinusoidal$cell_times)), lwd=2)

Gene patterns

We plot the patterns for two genes (for arbitrary indices)

plot(cycle_data_reorder[order(out_sinusoidal$cell_times),1], type="l", xlab="cell times", ylab="gene expression")

plot(cycle_data[,1], type="l", xlab="cell times", ylab="gene expression")

plot(cycle_data_reorder[order(out_sinusoidal$cell_times),60], type="l", xlab="cell times", ylab="gene expression")

plot(cycle_data[,60], type="l", xlab="cell times", ylab="gene expression")

Rank PCA vs cellcycleR

Now we rank the cells based on the cell times estimated from sinusoidal cellcycleR and then plot the ranks against the ranks based on the first PC.

plot(rank(out_sinusoidal$cell_times), rank(pca_cycle$x[,1]), xlab="sin. cellcycleR rank", ylab="pca 1 rank", pch=20, lwd=0.5, cex=1, col="red")

plot(rank(out_sinusoidal$cell_times), rank(pca_cycle$x[,2]), xlab="sin. cellcycleR rank", ylab="pca 2 rank", pch=20, lwd=0.5, cex=1, col="red")

Simulation example 3 (Low SNR scenario)

library(cellcycleR)
library(wavethresh)

G <- 100;
num_cells <- 256;
amp_genes <- rep(1, G);
phi_genes <- rep(c(2,4,6,8), each=G/4);
sigma_genes <- rchisq(G, 6);
cell_times_sim <- seq(0,2*pi, length.out=num_cells);
cycle_data <- sim_sinusoidal_cycle(G, amp_genes, phi_genes, sigma_genes, cell_times_sim);

celltime_levels <- 256;

sample_reorder <- sample(1:num_cells,num_cells, replace=FALSE);
cell_times_reorder <- cell_times_sim[sample_reorder];
cycle_data_reorder <- cycle_data[sample_reorder,];

PC1 vs PC2

We perform PCA on the reordered data.

pca_cycle <- prcomp(cycle_data_reorder, center=TRUE, scale. = TRUE);
plot(pca_cycle$x[,1], pca_cycle$x[,2], pch=20, lwd=0.01)

We apply sinusoidal cellcycleR

system.time(out_sinusoidal <- sin_cell_ordering_class(cycle_data_reorder, celltime_levels = 256, num_iter=100, verbose=FALSE))
## Final loglikelihood, iter 100 : -76919.9005907
##    user  system elapsed 
## 149.808  54.153  72.759

cellcycleR: radial plots

We plot the radial plots first.

library(plotrix)
library(RColorBrewer)
radial.plot(lengths=1:length(out_sinusoidal$cell_times),radial.pos=out_sinusoidal$cell_times[order(cell_times_reorder)],
            line.col=colorRampPalette(brewer.pal(9,"Blues"))(length(out_sinusoidal$cell_times)), lwd=2)

Gene patterns

We plot the patterns for two genes (for arbitrary indices)

plot(cycle_data_reorder[order(out_sinusoidal$cell_times),1], type="l", xlab="cell times", ylab="gene expression")

plot(cycle_data[,1], type="l", xlab="cell times", ylab="gene expression")

plot(cycle_data_reorder[order(out_sinusoidal$cell_times),60], type="l", xlab="cell times", ylab="gene expression")

plot(cycle_data[,60], type="l", xlab="cell times", ylab="gene expression")

Rank PCA vs cellcycleR

Now we rank the cells based on the cell times estimated from sinusoidal cellcycleR and then plot the ranks against the ranks based on the first PC.

plot(rank(out_sinusoidal$cell_times), rank(pca_cycle$x[,1]), xlab="sin. cellcycleR rank", ylab="pca 1 rank", pch=20, lwd=0.5, cex=1, col="red")

plot(rank(out_sinusoidal$cell_times), rank(pca_cycle$x[,2]), xlab="sin. cellcycleR rank", ylab="pca 2 rank", pch=20, lwd=0.5, cex=1, col="red")

Conclusion

It seems that when the SNR is high (Example 1), PC1 versus PC2 looks perfectly circular and cellcycleR does pretty well in recovering sinusoidal patterns. However the ranks from PC plots does not correspond to the ranks from the ordered cell times after we re-order the cells by the cellcycleR method. In Exmaple 2, where there is substantial noise, the circular pattern in the plot of PC1 vs PC2 is noisy, however the cellcycleR still seems to recover correct ordering (check radial plots) and the gene patterns look sinusoidal. In Example 3, when there is much higher noise compared to signal, PC plots are very noisy and circular pattern not visible at all. But although gene patterns no longer look sinusoidal, but the cellcycleR cell ordering does not seem too bad (check radial plot).